We will make use of the Pima indian dataset again, as in the previous Markdown document. Importantly, note that in this setting we are not standardizing the predictors. This usually leads to additional difficulties, as the scales of the predictors vary.
Pima <- rbind(MASS::Pima.tr, MASS::Pima.te)
y <- as.numeric(Pima$type == "Yes") # Binary outcome
X <- cbind(1, model.matrix(type ~ . - 1, data = Pima)) # Design matrixAs before, we will employ a relatively vague prior centered at \(0\), namely \(\beta \sim N(0, 100 I_p)\). Then, we implement the log-likelihood, the log-posterior and the gradient of the likelihood.
# Log-likelihood of a logistic regression model
# Log-likelihood of a logistic regression model.
#
# Uses the numerically stable identity
#   log(1 + exp(eta)) = max(eta, 0) + log1p(exp(-|eta|)),
# which avoids overflow to Inf when |eta| is large (the naive
# log(1 + exp(eta)) returns Inf for eta > ~709, making the whole
# log-likelihood -Inf for extreme coefficient values).
#
# beta: coefficient vector (length p)
# y:    binary response vector (0/1)
# X:    n x p design matrix
loglik <- function(beta, y, X) {
eta <- c(X %*% beta)
sum(y * eta) - sum(pmax(eta, 0) + log1p(exp(-abs(eta))))
}
# Log-posterior
# Log-posterior (up to an additive constant).
#
# Prior: beta ~ N(0, 100 * I_p), i.e. independent N(0, sd = 10)
# components — consistent with the -beta / 100 prior gradient used
# in lgradient().  Uses TRUE rather than the reassignable shorthand T.
logpost <- function(beta, y, X) {
loglik(beta, y, X) + sum(dnorm(beta, 0, 10, log = TRUE))
}
# Gradient of the logposterior
# Gradient of the log-posterior of the logistic model.
#
# Likelihood part: X'(y - p) with p = plogis(X beta).
# Prior part: gradient of the N(0, 100 I_p) log-density, i.e. -beta / 100.
lgradient <- function(beta, y, X) {
fitted_probs <- plogis(as.vector(X %*% beta))
score <- as.vector(crossprod(X, y - fitted_probs))
score - beta / 100
}
# Summary table for the 6 considered methods
summary_tab <- matrix(0, nrow = 6, ncol = 4)
colnames(summary_tab) <- c("Seconds", "Average ESS", "Average ESS per sec", "Average acceptance rate")
rownames(summary_tab) <- c("MH Laplace + Rcpp", "MALA", "MALA tuned", "HMC", "STAN", "Pólya-Gamma")We first consider a random walk Metropolis-Hastings algorithm whose proposal covariance is based on a Laplace approximation of the posterior.
library(coda)
R <- 30000
burn_in <- 5000set.seed(123)
# Covariance matrix is selected via laplace approximation
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
p <- ncol(X)
S <- 2.38^2 * vcov(fit_logit) / p
# Running the MCMC
start.time <- Sys.time()
# MCMC
fit_MCMC <- as.mcmc(RMH_arma(R, burn_in, y, X, S)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))
# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
Min. 1st Qu. Median Mean 3rd Qu. Max.
1103 1154 1163 1166 1192 1204
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
Min. 1st Qu. Median Mean 3rd Qu. Max.
24.93 25.16 25.79 25.75 26.01 27.19
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.2682 0.2682 0.2682 0.2682 0.2682 0.2682
# Summary statistics
summary_tab[1, ] <- c(
time_in_sec, mean(effectiveSize(fit_MCMC)),
mean(effectiveSize(fit_MCMC)) / time_in_sec,
1 - mean(rejectionRate(fit_MCMC))
)
# Traceplot of the intercept
plot(fit_MCMC[, 1:2])# R represents the number of samples
# burn_in is the number of discarded samples
# epsilon, S are tuning parameters
# Metropolis-adjusted Langevin algorithm (MALA) with a preconditioning
# matrix S.  The proposal is
#   beta* ~ N(beta + (sigma2 / 2) * S grad, sigma2 * S),
# accepted with the usual Metropolis-Hastings ratio.
#
# R:       number of retained samples
# burn_in: number of discarded initial samples
# y, X:    response vector and design matrix
# epsilon: step-size tuning parameter (rescaled by p^(1/3) below)
# S:       preconditioning / proposal covariance matrix
MALA <- function(R, burn_in, y, X, epsilon, S) {
p <- ncol(X)
out <- matrix(0, R, p) # Storage for the retained draws (R x p)
beta <- rep(0, p) # Chain initialized at the origin
A <- chol(S) # Upper-triangular Cholesky factor: t(A) %*% A = S
S1 <- solve(S) # Inverse of S, needed in the proposal log-density
lgrad <- c(S %*% lgradient(beta, y, X)) # Preconditioned gradient at the current state
logp <- logpost(beta, y, X) # Log-posterior at the current state
sigma2 <- epsilon^2 / p^(1 / 3) # Step size with the p^(1/3) optimal-scaling rate
sigma <- sqrt(sigma2)
# MALA iterations (Metropolis-Hastings with a Langevin proposal — not a Gibbs sampler)
for (r in 1:(burn_in + R)) {
# Langevin proposal: gradient drift plus N(0, sigma2 * S) noise
beta_new <- beta + sigma2 / 2 * lgrad + sigma * c(crossprod(A, rnorm(p)))
logpnew <- logpost(beta_new, y, X)
lgrad_new <- c(S %*% lgradient(beta_new, y, X))
# Deviations from the proposal means, for q(beta | beta_new) and
# q(beta_new | beta); additive constants cancel in the ratio
diffold <- beta - beta_new - sigma2 / 2 * lgrad_new
diffnew <- beta_new - beta - sigma2 / 2 * lgrad
qold <- -diffold %*% S1 %*% diffold / (2 * sigma2)
qnew <- -diffnew %*% S1 %*% diffnew / (2 * sigma2)
# MH acceptance probability with the proposal-density correction
alpha <- min(1, exp(logpnew - logp + qold - qnew))
if (runif(1) < alpha) {
logp <- logpnew
lgrad <- lgrad_new
beta <- beta_new # Accept the proposal
}
# Store the values after the burn-in period
if (r > burn_in) {
out[r - burn_in, ] <- beta
}
}
out
}set.seed(123)
epsilon <- 0.0017 # After some trial and error
# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(MALA(R = R, burn_in = burn_in, y, X, epsilon, S = diag(ncol(X)))) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))
# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
Min. 1st Qu. Median Mean 3rd Qu. Max.
2.900 9.358 27.201 44.321 46.238 166.223
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
Min. 1st Qu. Median Mean 3rd Qu. Max.
180.5 728.7 1208.5 2671.3 3283.2 10343.4
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.5638 0.5638 0.5638 0.5638 0.5638 0.5638
# Summary statistics
summary_tab[2, ] <- c(
time_in_sec, mean(effectiveSize(fit_MCMC)),
mean(effectiveSize(fit_MCMC)) / time_in_sec,
1 - mean(rejectionRate(fit_MCMC))
)
# Traceplot of the intercept
plot(fit_MCMC[, 1:2])library(coda)
R <- 30000
burn_in <- 5000set.seed(123)
epsilon <- 1.68 # After some trial and error
# Covariance matrix is selected via laplace approximation
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
S <- vcov(fit_logit)
# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(MALA(R = R, burn_in = burn_in, y, X, epsilon, S)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))
# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
Min. 1st Qu. Median Mean 3rd Qu. Max.
8583 8762 9196 9063 9312 9409
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.189 3.222 3.263 3.314 3.424 3.495
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.5686 0.5686 0.5686 0.5686 0.5686 0.5686
# Summary statistics
summary_tab[3, ] <- c(
time_in_sec, mean(effectiveSize(fit_MCMC)),
mean(effectiveSize(fit_MCMC)) / time_in_sec,
1 - mean(rejectionRate(fit_MCMC))
)
# Traceplot of the intercept
plot(fit_MCMC[, 1:2])HMC <- function(R, burn_in, y, X, epsilon, S, L = 10) {
# Hamiltonian Monte Carlo with preconditioning matrix S: momenta are
# drawn from N(0, S^{-1}) (mass matrix M = S^{-1}) and the kinetic
# energy is P' S P / 2, so S acts as the inverse mass matrix.
#
# R:       number of retained samples
# burn_in: number of discarded initial samples
# y, X:    response vector and design matrix
# epsilon: leapfrog step size
# S:       inverse mass matrix (here a Laplace approximation of the
#          posterior covariance is passed by the caller)
# L:       number of leapfrog steps per trajectory
p <- ncol(X)
out <- matrix(0, R, p) # Storage for the retained draws (R x p)
beta <- rep(0, p) # Chain initialized at the origin
logp <- logpost(beta, y, X) # Initial log-posterior (negative potential energy)
S1 <- solve(S) # Mass matrix M = S^{-1}
A1 <- chol(S1) # Cholesky factor of M, used to sample the momenta
# HMC iterations (Metropolis-Hastings with a leapfrog-integrated proposal)
for (r in 1:(burn_in + R)) {
P <- c(crossprod(A1, rnorm(p))) # Momentum draw: P ~ N(0, S^{-1})
logK <- c(P %*% S %*% P / 2) # Kinetic energy at the beginning of the trajectory
# Leapfrog integration: half step for momentum at the beginning
beta_new <- beta
Pnew <- P + epsilon * lgradient(beta_new, y, X) / 2
# Alternate full steps for position and momentum
for (l in 1:L) {
# Make a full step for the position (velocity = M^{-1} P = S P)
beta_new <- beta_new + epsilon * c(S %*% Pnew)
# Make a full step for the momentum, except at end of trajectory
if (l != L) Pnew <- Pnew + epsilon * lgradient(beta_new, y, X)
}
# Make a half step for momentum at the end.
Pnew <- Pnew + epsilon * lgradient(beta_new, y, X) / 2
# Negate momentum at end of trajectory to make the proposal symmetric
Pnew <- - Pnew
# Evaluate potential and kinetic energies at the end of trajectory
logpnew <- logpost(beta_new, y, X)
logKnew <- Pnew %*% S %*% Pnew / 2 # Kinetic energy at the end of the trajectory
# Accept or reject the state at end of trajectory, returning either
# the position at the end of the trajectory or the initial position
if (runif(1) < exp(logpnew - logp + logK - logKnew)) {
logp <- logpnew
beta <- beta_new # Accept the value
}
# Store the values after the burn-in period
if (r > burn_in) {
out[r - burn_in, ] <- beta
}
}
out
}set.seed(123)
epsilon <- 0.25 # After some trial and error
L <- 10
# Covariance matrix is selected via laplace approximation
fit_logit <- glm(y ~ X - 1, family = binomial(link = "logit"))
S <- vcov(fit_logit)
# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(HMC(R = R, burn_in = burn_in, y, X, epsilon, S, L)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))
# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
Min. 1st Qu. Median Mean 3rd Qu. Max.
215765 222946 226610 225565 228360 233334
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1286 0.1314 0.1324 0.1331 0.1346 0.1390
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.9892 0.9892 0.9892 0.9892 0.9892 0.9892
# Summary statistics
summary_tab[4, ] <- c(
time_in_sec, mean(effectiveSize(fit_MCMC)),
mean(effectiveSize(fit_MCMC)) / time_in_sec,
1 - mean(rejectionRate(fit_MCMC))
)
# Traceplot of the intercept
plot(fit_MCMC[, 1:2])library(rstan)
# I am not counting the compilation time
stan_compiled <- stan_model(file = "logistic.stan") # Stan programset.seed(1234)
# Running the MCMC
start.time <- Sys.time()
fit_HMC <- sampling(
stan_compiled, # The stan file has been previously compiled
data = list(X = X, y = y, n = nrow(X), p = ncol(X)), # named list of data
chains = 1, # number of Markov chains
warmup = burn_in, # Burn-in iterations per chain
iter = R + burn_in # Total number of iterations per chain
)
SAMPLING FOR MODEL 'logistic' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 9.6e-05 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 0.96 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 35000 [ 0%] (Warmup)
Chain 1: Iteration: 3500 / 35000 [ 10%] (Warmup)
Chain 1: Iteration: 5001 / 35000 [ 14%] (Sampling)
Chain 1: Iteration: 8500 / 35000 [ 24%] (Sampling)
Chain 1: Iteration: 12000 / 35000 [ 34%] (Sampling)
Chain 1: Iteration: 15500 / 35000 [ 44%] (Sampling)
Chain 1: Iteration: 19000 / 35000 [ 54%] (Sampling)
Chain 1: Iteration: 22500 / 35000 [ 64%] (Sampling)
Chain 1: Iteration: 26000 / 35000 [ 74%] (Sampling)
Chain 1: Iteration: 29500 / 35000 [ 84%] (Sampling)
Chain 1: Iteration: 33000 / 35000 [ 94%] (Sampling)
Chain 1: Iteration: 35000 / 35000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 10.9396 seconds (Warm-up)
Chain 1: 67.1344 seconds (Sampling)
Chain 1: 78.0739 seconds (Total)
Chain 1:
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))
fit_HMC <- as.mcmc(extract(fit_HMC)$beta)
# Diagnostic
summary(effectiveSize(fit_HMC)) # Effective sample size
Min. 1st Qu. Median Mean 3rd Qu. Max.
28917 30000 30000 29865 30000 30000
summary(R / effectiveSize(fit_HMC)) # Integrated autocorrelation time
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 1.000 1.000 1.005 1.000 1.037
summary(1 - rejectionRate(fit_HMC)) # Acceptance rate
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 1 1 1 1 1
# Summary statistics
summary_tab[5, ] <- c(
time_in_sec, mean(effectiveSize(fit_HMC)),
mean(effectiveSize(fit_HMC)) / time_in_sec,
1 - mean(rejectionRate(fit_HMC))
)
# Traceplot of the intercept
plot(fit_HMC[, 1:2])library(BayesLogit)
# Pólya-Gamma Gibbs sampler for Bayesian logistic regression
# (Polson, Scott & Windle data augmentation), with prior beta ~ N(b, B).
#
# R:       number of retained samples
# burn_in: number of discarded initial samples
# y, X:    binary response vector and n x p design matrix
# B:       prior covariance matrix of beta
# b:       prior mean of beta
logit_Gibbs <- function(R, burn_in, y, X, B, b) {
p <- ncol(X)
n <- nrow(X)
out <- matrix(0, R, p) # Storage for the retained draws (R x p)
P <- solve(B) # Prior precision matrix
Pb <- P %*% b # Precision-weighted prior mean, fixed across iterations
Xy <- crossprod(X, y - 1 / 2) # X'(y - 1/2): fixed part of the full-conditional mean
# Initialization
beta <- rep(0, p)
# Gibbs iterations: alternate sampling omega | beta and beta | omega
for (r in 1:(R + burn_in)) {
# Sampling the Pólya-gamma latent variables: omega_i ~ PG(1, x_i' beta)
eta <- c(X %*% beta)
omega <- rpg.devroye(num = n, h = 1, z = eta)
# Sampling beta | omega ~ N(mu, Sigma) with
#   Sigma = (X' diag(omega) X + P)^{-1},  mu = Sigma (X'(y - 1/2) + P b);
# the eigendecomposition yields both Sigma and a square root of it
eig <- eigen(crossprod(X * sqrt(omega)) + P, symmetric = TRUE)
Sigma <- crossprod(t(eig$vectors) / sqrt(eig$values)) # = V diag(1/lambda) V'
mu <- Sigma %*% (Xy + Pb)
A1 <- t(eig$vectors) / sqrt(eig$values) # Square root satisfying t(A1) %*% A1 = Sigma
beta <- mu + c(matrix(rnorm(1 * p), 1, p) %*% A1) # mu + t(A1) z with z ~ N(0, I_p)
# Store the values after the burn-in period
if (r > burn_in) {
out[r - burn_in, ] <- beta
}
}
out
}B <- diag(100, ncol(X)) # Prior covariance matrix
b <- rep(0, ncol(X)) # Prior meanset.seed(123)
# Running the MCMC
start.time <- Sys.time()
fit_MCMC <- as.mcmc(logit_Gibbs(R, burn_in, y, X, B, b)) # Convert the matrix into a "coda" object
end.time <- Sys.time()
time_in_sec <- as.numeric(difftime(end.time, start.time, units = "secs"))
# Diagnostic
summary(effectiveSize(fit_MCMC)) # Effective sample size
Min. 1st Qu. Median Mean 3rd Qu. Max.
10018 13592 15411 15182 17369 18900
summary(R / effectiveSize(fit_MCMC)) # Integrated autocorrelation time
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.587 1.728 1.950 2.051 2.209 2.995
summary(1 - rejectionRate(fit_MCMC)) # Acceptance rate
Min. 1st Qu. Median Mean 3rd Qu. Max.
1 1 1 1 1 1
# Summary statistics
summary_tab[6, ] <- c(
time_in_sec, mean(effectiveSize(fit_MCMC)),
mean(effectiveSize(fit_MCMC)) / time_in_sec,
1 - mean(rejectionRate(fit_MCMC))
)
# Traceplot of the intercept
plot(fit_MCMC[, 1:2])| Seconds | Average ESS | Average ESS per sec | Average acceptance rate | |
|---|---|---|---|---|
| MH Laplace + Rcpp | 0.561887 | 1165.75640 | 2074.71670 | 0.2682089 |
| MALA | 2.743667 | 44.32131 | 16.15404 | 0.5637855 |
| MALA tuned | 2.624531 | 9063.31983 | 3453.31022 | 0.5686190 |
| HMC | 14.735173 | 225565.16881 | 15307.94169 | 0.9892330 |
| STAN | 78.846092 | 29864.58683 | 378.77067 | 1.0000000 |
| Pólya-Gamma | 10.254378 | 15181.91528 | 1480.53009 | 1.0000000 |